
## Significant Incidents Against Americans Abroad: Introducing a New Dataset

rm(list=ls())

# Library
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(lmtest)
library(sandwich)
library(MASS)


# Set the file path 
file_path <- "...\\SIAAA.xlsx"

# Read the Excel file into R and name it 'mydata'
mydata <- read_excel(file_path)

## Create A new variable for each observation
mydata$Total <- 1


### Figure 1
## Descriptive time series
#Calculate the total incidents per year for the "Total" variable
mydata_total <- mydata %>%
  group_by(Year) %>%
  summarise(TotalIncidents = sum(Total, na.rm = TRUE), .groups = "drop")

# Calculate the mean
mean_incidents <- mean(mydata_total$TotalIncidents, na.rm = TRUE)

# Plotting the data
ggplot(mydata_total, aes(x = Year, y = TotalIncidents)) +
  geom_line(color = "black",size = 1) +            # Time series line
  geom_hline(yintercept = mean_incidents, # Horizontal mean line
             color = "gray", 
             linetype = "dashed", 
             size = 1.2) +
  geom_text(aes(x = max(mydata_total$Year) - 2, # Move the label towards the right side of the graph
                y = mean_incidents + (max(mydata_total$TotalIncidents) * 0.09),  # Adjust y value for the position a bit above the line
                label = sprintf("Mean Score: %.2f", mean_incidents)), 
            color = "black", hjust = "right", check_overlap = TRUE) +
  labs(y = "Incidents", 
       x = "Year") +
  theme_minimal() +                      # Minimal theme
  theme(panel.background = element_rect(fill = "white"),  # White background
        panel.grid.major = element_line(color = "grey90"), # Major grid lines
        panel.grid.minor = element_blank(),              # No minor grid lines
        axis.text = element_text(color = "black"),       # Text color
        axis.title = element_text(color = "black"))      # Title color

######################

## Table 1
## Types of anti-Americanism

# Calculate counts and percentages for each category
mydata_summary <- mydata %>%
  summarise(
    Bussiness_count = sum(Business, na.rm = TRUE),
    Bussiness_pct = mean(Business, na.rm = TRUE) * 100,
    Government_count = sum(Government, na.rm = TRUE),
    Government_pct = mean(Government, na.rm = TRUE) * 100,
    Military_count = sum(Military, na.rm = TRUE),
    Military_pct = mean(Military, na.rm = TRUE) * 100,
    Private_count = sum(Private, na.rm = TRUE),
    Private_pct = mean(Private, na.rm = TRUE) * 100
  )

# Print the summary
print(mydata_summary)


### Figure 2
## Regional patterns
# Compute the cumulative sum for the stacked line graph
grouped_data <- mydata %>%
  group_by(Year, RegionWB) %>%
  summarise(Incidents = sum(Total, na.rm = TRUE)) %>%
  ungroup()

# Using ggplot2 to create the stacked area chart
ggplot(grouped_data, aes(x = Year, y = Incidents, fill = RegionWB)) +
  geom_area(position = 'stack', alpha = 0.7) +
  labs(title = "",
       x = "Year", y = "Incidents") +
  scale_fill_brewer(palette = "Set3", name = "") +
  theme_minimal() +
  theme(legend.text = element_text(size = 7))  # Adjust the size as necessary


##Figure 4
### Type of action

# 1. Aggregate by Action_type
action_data <- mydata %>%
  group_by(Action_type) %>%
  summarise(Incidents = sum(Total, na.rm = TRUE)) %>%
  arrange(-Incidents) %>%
  ungroup()

# 2. Plot with gradual color
ggplot(action_data, aes(x = reorder(Action_type, Incidents), y = Incidents, fill = Incidents)) + 
  geom_bar(stat = "identity", alpha = 0.8) +
  scale_fill_viridis_c(trans = "sqrt", direction = -1) +  # Using a color gradient based on frequency
  coord_flip() +
  labs(title = "",
       x = "Action Type", 
       y = "Incidents") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))


###Table 2
## Ideology #####

# Calculate the frequency and percentage for each ideological category
summary_data <- mydata %>%
  group_by(Actor_ideology) %>%
  summarise(Frequency = n()) %>%
  mutate(Percentage = (Frequency/sum(Frequency))*100)

# Print the result
print(summary_data)


## Figure 5
# Group data by Year and Actor_ideology 
filtered_data <- mydata %>%
  filter(Actor_ideology %in% c("Islamic", "Leftist", "Nationalist")) %>%
  group_by(Year, Actor_ideology) %>%
  summarise(Incident_Count = n())

head(filtered_data)



# Define custom colors
custom_colors <- c("Islamic" = "#099463", "Leftist" = "#d10d0d", "Nationalist" = "#0e5ce3")

# Plotting
p <- ggplot(filtered_data, aes(x = Year, y = Incident_Count, fill = Actor_ideology)) +
  geom_ribbon(aes(ymin = 0, ymax = Incident_Count), alpha = 0.5) +
  labs(title = "", 
       y = "Incidents", 
       x = "Year",
       fill = NULL) +  # Set fill to NULL to remove the word "Ideology" from the legend
  facet_wrap(~ Actor_ideology, scales = "free_y") +
  theme_minimal() +
  theme(legend.position = "top",
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = custom_colors)  # Add this line to specify custom colors

# Ensure Nationalist y-axis labels are whole numbers
if ("Nationalist" %in% unique(filtered_data$Actor_ideology)) {
  p <- p + scale_y_continuous(labels = scales::number_format(accuracy = 1))
}

print(p)



## Figure 6
### Regression discontinuity

# Replace 'Unknown' with NA
mydata$Date[mydata$Date == "Unknown"] <- NA

# Convert numeric dates to R's Date format
mydata$Date <- as.Date(as.numeric(mydata$Date), origin = "1900-01-01")
mydata$Date <- mydata$Date - 2

# Create a data frame with all dates from Jan 1 to June 13
all_dates <- data.frame(Date = seq.Date(from = as.Date("1999-01-01"), to = as.Date("1999-06-13"), by = "day"))

# Aggregate the data by date and get the sum of incidents on each day
aggregated_data <- mydata %>%
  group_by(Date) %>%
  summarize(TotalIncidents = sum(Total))

# Merge the two data frames
full_daily_data <- left_join(all_dates, aggregated_data, by = "Date")

# Replace NAs with 0
full_daily_data$TotalIncidents[is.na(full_daily_data$TotalIncidents)] <- 0

# Create a days variable
full_daily_data$Days <- as.numeric(full_daily_data$Date - as.Date("1999-01-01"))

# Set the treatment date
treatment_date <- as.Date("1999-03-24")

# Create a quadratic term for days
full_daily_data$Days2 <- full_daily_data$Days^2

# Calculate the fitted values using quadratic regression for both pre and post-treatment
model_pre_quad <- lm(TotalIncidents ~ Days + Days2, data = subset(full_daily_data, Date < treatment_date))
model_post_quad <- lm(TotalIncidents ~ Days + Days2, data = subset(full_daily_data, Date > treatment_date))
full_daily_data$Fitted[full_daily_data$Date < treatment_date] <- predict(model_pre_quad)
full_daily_data$Fitted[full_daily_data$Date >= treatment_date] <- predict(model_post_quad)

# Plotting
ggplot(full_daily_data, aes(x = Date, y = TotalIncidents)) +
  geom_point(aes(color = ifelse(Date < treatment_date, "Before Bombing", "After Bombing")), size = 2) +
  geom_vline(aes(xintercept = as.numeric(treatment_date)), linetype = "dashed", color = "red", size = 1) +
  
  # Curvy Fitted lines for pre-treatment
  geom_line(data = subset(full_daily_data, Date < treatment_date), aes(y = Fitted, color = "Fitted Line"), size = 1) +
  
  # Curvy Fitted lines for post-treatment
  geom_line(data = subset(full_daily_data, Date > treatment_date), aes(y = Fitted, color = "Fitted Line"), size = 1) +
  
  labs(title = "",
       x = "Date",
       y = "Incidents",
       color = "") +  
  theme_minimal() +
  scale_color_manual(values = c("Before Bombing" = "#008080", "After Bombing" = "#4682B4", "Fitted Line" = "#C0C0C0")) +
  guides(color = guide_legend(override.aes = list(size = c(2, 2, 1))))


## Table 3
##Regression 
# Negative binomial regression with Days and Days2 as predictors
full_daily_data$PostTreatment <- ifelse(full_daily_data$Date >= treatment_date, 1, 0)

# Negative binomial regression with PostTreatment as the predictor
model_nb1 <- glm.nb(TotalIncidents ~ PostTreatment, data = full_daily_data)
summary(model_nb1)

# Number of incidents from the previous day) to account for autocorrelation
full_daily_data$LaggedIncidents <- lag(full_daily_data$TotalIncidents, order_by = full_daily_data$Date)

# Ocalan Arrest
full_daily_data$OcalanArrest <- ifelse(full_daily_data$Date >= as.Date("1999-02-16") & 
                                         full_daily_data$Date <= as.Date("1999-04-16"), 1, 0)
## Weekends
full_daily_data$Weekend <- ifelse(weekdays(full_daily_data$Date) %in% c("Saturday", "Sunday"), 1, 0)

# Model 1
model_nb1 <- glm.nb(TotalIncidents ~ PostTreatment, data = full_daily_data)

# Summary of the model
summary(model_nb1)

# Model 2
model_nb2 <- glm.nb(TotalIncidents ~ PostTreatment + OcalanArrest + Weekend + LaggedIncidents, data = full_daily_data)

# Summary of the model
summary(model_nb2)


missing_rows <- apply(is.na(full_daily_data), 1, any)
which(missing_rows)



###############################
############    Appendix   ###########
###############################
# Define the file path
new_file_path <- "...\\Crossvalidation.xlsx"

# Read the Excel file
cross <- read_excel(new_file_path)


## Appendix 1
# Insert rows with NA values for 2003-2007
missing_years <- data.frame(Year = 2003:2007, GTD = NA, SIA = NA)
cross <- rbind(cross %>% filter(Year < 2003), missing_years, cross %>% filter(Year > 2007))

# Reshape the data to longer format
data_long_na <- cross %>% pivot_longer(cols = c(GTD, SIA), names_to = "Variable", values_to = "Value")

# Define the correlation text
cor_text <- paste("Correlation: ", round(0.9001424, 6))

# Create the time series plot
ggplot(data_long_na, aes(x = Year, y = Value, color = Variable, group = Variable)) +
  geom_line(size = 1, na.rm = TRUE) + 
  labs(x = "Year", y = "Observation", color = "Dataset",
       title = "GTD and SIAAA Comparison: Philippines",
       subtitle = "Data from 2003-2007 Removed") +
  theme_bw() + 
  scale_color_manual(values = c("GTD" = "blue", "SIA" = "red"), 
                     labels = c("GTD", "SIAAA")) +
  theme(
    plot.title = element_text(face = "bold"), 
    plot.subtitle = element_text(face = "italic"),
    axis.title = element_text(face = "bold"),
    legend.title = element_text(face = "bold"),
    legend.position = "bottom",
    panel.grid.major = element_line(linetype = "dashed", color = "grey"),
    plot.background = element_rect(fill = "white", colour = "black", size = 0.5)
  ) +
  geom_text(aes(x = median(data_long_na$Year), 
                y = max(data_long_na$Value, na.rm = TRUE), 
                label = cor_text), 
            vjust = 1.5, hjust = -.5, color = "black", size = 2.5)





# Calculate correlation
correlation <- cor(cross$GTD, cross$SIA, use="complete.obs")

# Print the correlation
print(correlation)


# Import the 'Greece' sheet from the Excel file
data_Greece <- read_excel(new_file_path, sheet = "Greece")

# Insert rows with NA values for 2003-2007
missing_years <- data.frame(Year = 2003:2007, GTD = NA, SIA = NA)
data_Greece <- rbind(data_Greece %>% filter(Year < 2003), missing_years, data_Greece %>% filter(Year > 2007))

# Convert to long format
data_long_greece <- data_Greece %>% 
  pivot_longer(cols = c("GTD", "SIA"), names_to = "Variable", values_to = "Value")

# Define the correlation text
cor_text <- paste("Correlation: ", round(0.8770984, 6))

# Create the time series plot
ggplot(data_long_greece, aes(x = Year, y = Value, color = Variable, group = Variable)) +
  geom_line(size = 1, na.rm = TRUE) + 
  labs(x = "Year", 
       y = "Observation", 
       color = "Dataset",
       title = "GTD and SIAAA Comparison: Greece",
       subtitle = "Data from 2003-2007 Removed") +
  theme_bw() + 
  scale_color_manual(values = c("GTD" = "blue", "SIA" = "red"), 
                     labels = c("GTD", "SIAAA")) +
  theme(
    plot.title = element_text(face = "bold"), 
    plot.subtitle = element_text(face = "italic"),
    axis.title = element_text(face = "bold"),
    legend.title = element_text(face = "bold"),
    legend.position = "bottom",
    panel.grid.major = element_line(linetype = "dashed", color = "grey"),
    plot.background = element_rect(fill = "white", colour = "black", size = 1)
  ) +
  geom_text(aes(x = median(data_long_greece$Year), 
                y = max(data_long_greece$Value, na.rm = TRUE), 
                label = cor_text), 
            vjust = 1.5, hjust = -.5, color = "black", size = 3.5)




# Calculate correlation
correlation <- cor(data_Greece$GTD, data_Greece$SIA, use="complete.obs")

# Print the correlation
print(correlation)



 ## Appendix 2: The number of incidents per year
# Step 1: Calculate the total incidents per year for the "Total" variable
mydata_total <- mydata %>%
  group_by(Year) %>%
  summarise(TotalIncidents = sum(Total, na.rm = TRUE), .groups = "drop")

# Step 2: Plot the bar graph with the count on top of each bar
ggplot(mydata_total, aes(x = factor(Year), y = TotalIncidents)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = TotalIncidents), vjust = -0.2, color = "black", size = 3.1) +  # Add count on top of each bar
  labs(title = "",
       x = "Year",
       y = "Incidents") +
  theme_minimal() +
  scale_x_discrete(breaks = mydata_total$Year, labels = mydata_total$Year) +  # Include all years on x-axis
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))  # Rotate x-axis labels vertically

## Appendix 3
# 1. Exclude 'Unknown' and NAs, Aggregate, and 2. Filter
top_locations <- mydata %>%
  filter(!is.na(Location) & Location != "Unknown") %>%  # Exclude NAs and 'Unknown'
  group_by(Location) %>%
  summarise(Incidents = sum(Total, na.rm = TRUE)) %>%
  arrange(-Incidents) %>%
  head(10) %>%
  ungroup()

# 3. Plot with gradual color
ggplot(top_locations, aes(x = reorder(Location, Incidents), y = Incidents, fill = Incidents)) + 
  geom_bar(stat = "identity", alpha = 0.8) +
  scale_fill_viridis_c(trans = "sqrt", direction = -1) +  # Using a color gradient based on frequency. 
  coord_flip() +
  labs(title = "",
       x = "Location", 
       y = "Incidents") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))






 ## Appendix 4
### Different periods

# Step 1: Calculate the total incidents per year for the "Total" variable
mydata_total <- mydata %>%
  group_by(Year) %>%
  summarise(TotalIncidents = sum(Total, na.rm = TRUE), .groups = "drop")

# Step 2: Calculate the average number for years 1993-1997, 1998-2007, and 2008-2015
avg_period1 <- mydata_total %>%
  filter(Year >= 1987 & Year <= 1992) %>%
  summarise(AverageIncidents = mean(TotalIncidents))

avg_period2 <- mydata_total %>%
  filter(Year >= 1993 & Year <= 1997) %>%
  summarise(AverageIncidents = mean(TotalIncidents))

avg_period3 <- mydata_total %>%
  filter(Year >= 1998 & Year <= 2004) %>%
  summarise(AverageIncidents = mean(TotalIncidents))

avg_period4 <- mydata_total %>%
  filter(Year >= 2005 & Year <= 2015) %>%
  summarise(AverageIncidents = mean(TotalIncidents))

# Step 3: Display the average numbers
cat("Average number of incidents for years 1987-1992:", avg_period1$AverageIncidents, "\n")
cat("Average number of incidents for years 1993-1997:", avg_period2$AverageIncidents, "\n")
cat("Average number of incidents for years 1998-2004:", avg_period3$AverageIncidents, "\n")
cat("Average number of incidents for years 2005-2015:", avg_period4$AverageIncidents, "\n")

# Plotting 
ggplot(mydata_total, aes(x = Year, y = TotalIncidents)) +
  geom_line() +
  geom_point() +
  geom_vline(xintercept = c(1992, 1997, 2004, 2015), color = "grey50", linetype="dashed", size = 0.8) + 
  geom_text(aes(x = (1987 + 1992) / 2, y = max(TotalIncidents) * 1.05, label = sprintf("Avg: %.2f", avg_period1$AverageIncidents)), size = 3) + # Adjusted size
  geom_text(aes(x = (1993 + 1997) / 2, y = max(TotalIncidents) * 1.05, label = sprintf("Avg: %.2f", avg_period2$AverageIncidents)), size = 3) + # Adjusted size
  geom_text(aes(x = (1998 + 2004) / 2, y = max(TotalIncidents) * 1.05, label = sprintf("Avg: %.2f", avg_period3$AverageIncidents)), size = 3) + # Adjusted size
  geom_text(aes(x = (2005 + 2015) / 2, y = max(TotalIncidents) * 1.05, label = sprintf("Avg: %.2f", avg_period4$AverageIncidents)), size = 3) + # Adjusted size
  labs(title = "", 
       y = "Incidents", 
       x = "Year") +
  theme_minimal() + 
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey90"),
        panel.grid.minor = element_blank(),
        axis.text = element_text(color = "black"),
        axis.title = element_text(color = "black"),
        plot.title = element_text(color = "black"))

table(mydata$Actor_ideology)






### Appendix 5

# Extracting the coefficients from the second model
coef_model_nb2 <- coef(model_nb2)

# Compute Incident Rate Ratios
irr_model_nb2 <- exp(coef_model_nb2)

# Print the IRRs
print(irr_model_nb2)


# Appendix 6: Violent vs. Non-violent behavior
# Filter the data for the desired ideologies

filtered_data <- mydata %>%
  filter(Actor_ideology %in% c("Leftist", "Islamic", "Nationalist")) %>%
  group_by(Actor_ideology, Violent) %>%
  summarize(count = n()) %>%
  mutate(percentage = count/sum(count) * 100)

# Convert 'Violent' variable to a factor for better labeling in the plot
filtered_data$Violent <- factor(filtered_data$Violent, levels = c(0, 1), labels = c("Non-Violent", "Violent"))

# Create an enhanced bar plot
ggplot(filtered_data, aes(x = Actor_ideology, y = percentage, fill = Violent)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  geom_text(aes(label=sprintf("%.1f%%", percentage)), vjust=-0.25, position = position_dodge(width=0.9)) +
  labs(title = "",
       x = " Ideology",
       y = "Percentage") + 
  scale_fill_brewer(palette="Set2") +
  theme_minimal() + 
  theme(legend.title = element_blank())




# Appendix 7: Ideology and target type

# Filter the data
filtered_data <- mydata %>%
  dplyr::filter(Actor_ideology %in% c("Islamic", "Leftist", "Nationalist"))

# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)

# Aggregate Data
agg_data <- aggregate(. ~ Actor_ideology, data = filtered_data[, c("Actor_ideology", "Business", "Government", "Military", "Private")], sum)

# Reshape Data
long_agg_data <- agg_data %>%
  pivot_longer(cols = c(Business, Government, Military, Private), names_to = "Target", values_to = "Count")

# Compute Percentages
long_agg_data <- long_agg_data %>%
  group_by(Actor_ideology) %>%
  mutate(Percentage = Count / sum(Count) * 100)

# Plotting
ggplot(long_agg_data, aes(x = Actor_ideology, y = Percentage, fill = Target)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "",
       x = "Ideology",
       y = "Percentage") +
  scale_fill_brewer(palette = "Spectral") +
  theme_minimal() +
  theme(legend.title = element_blank())




